# NOTE: the separator in "training_text" is "||", matched as a regex.
import pandas as pd

# rows look like: ID||TEXT ; "\|\|" must be a raw string — in a normal
# string "\|" is an invalid escape sequence (SyntaxWarning on Python 3.12+).
data_text = pd.read_csv("training_text", sep=r"\|\|", engine="python",
                        names=["ID", "TEXT"], skiprows=1)
print('Number of data points : ', data_text.shape[0])
print('Number of features : ', data_text.shape[1])
print('Features : ', data_text.columns.values)
data_text.head()

# "training_variants" is a regular comma-separated file.
data = pd.read_csv('training_variants')
print('Number of data points : ', data.shape[0])
print('Number of features : ', data.shape[1])
print('Features : ', data.columns.values)
data.head()

# pandas_profiling is an advanced/visual alternative to pd.describe();
# the complete html report is attached with the notebook.
import pandas_profiling
pandas_profiling.ProfileReport(data)
pfr = pandas_profiling.ProfileReport(data)
pfr.to_file("cancer_profiling.html")
# loading necessary libraries:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
# FIX: the private module sklearn.metrics.classification was removed in
# scikit-learn 0.24; these names live in sklearn.metrics.
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
#from imblearn.over_sampling import SMOTE
from collections import Counter
from scipy.sparse import hstack
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# StratifiedKFold now lives in sklearn.model_selection.
from sklearn.model_selection import StratifiedKFold
from collections import Counter, defaultdict
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import math
from sklearn.metrics import normalized_mutual_info_score
from sklearn.ensemble import RandomForestClassifier
#warnings.filterwarnings("ignore")
from mlxtend.classifier import StackingClassifier
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# english stop words removed during text preprocessing
stop_words = set(stopwords.words('english'))
def nlp_preprocessing(total_text, index, column):
    """Clean one document and write it back into the global ``data_text``.

    Steps: replace special characters with spaces, collapse whitespace,
    drop digits, lower-case, and remove english stop words.

    Parameters
    ----------
    total_text : str
        Raw document text; non-string values are skipped.
    index : hashable
        Row label in ``data_text`` to update.
    column : str
        Column name in ``data_text`` to update (e.g. 'TEXT').
    """
    # skip NaN / numeric placeholder rows (callers also guard on str)
    if not isinstance(total_text, str):
        return
    # replace every special char with a space (raw strings keep the
    # regex escapes valid on modern Python)
    total_text = re.sub(r'[^a-zA-Z0-9\n]', ' ', total_text)
    # replace multiple spaces with a single space
    total_text = re.sub(r'\s+', ' ', total_text)
    # drop digits entirely
    total_text = re.sub(r'\d+', '', total_text)
    # converting all the chars into lower-case
    total_text = total_text.lower()
    # keep only non-stop-words; join avoids the quadratic `+=` string build
    cleaned = " ".join(w for w in total_text.split() if w not in stop_words)
    # .loc avoids pandas chained-assignment (SettingWithCopy) problems
    data_text.loc[index, column] = cleaned
# text processing stage: clean every row of data_text in place.
# FIX: time.clock() was removed in Python 3.8; perf_counter() replaces it.
start_time = time.perf_counter()
for index, row in data_text.iterrows():
    if type(row['TEXT']) is str:
        nlp_preprocessing(row['TEXT'], index, 'TEXT')
    else:
        # rows with missing TEXT are filled later with Gene + Variation
        print("there is no text description for id:", index)
print('Time took for preprocessing the text :', time.perf_counter() - start_time, "seconds")
# merging both gene_variations and text data based on ID
result = pd.merge(data, data_text, on='ID', how='left')
result.to_pickle('cancer_merged.pckl')

# rows whose TEXT is missing after the left join
result[result.isnull().any(axis=1)]
# fall back to "Gene Variation" as the text for those rows
result.loc[result['TEXT'].isnull(), 'TEXT'] = result['Gene'] + ' ' + result['Variation']
result.to_pickle('cancer_joined.pckl')
null_text_ids = [1109, 1277, 1407, 1639, 2755]

# split the data into test and train by maintaining same distribution of
# output variable 'y_true' [stratify=y_true]
y_true = result['Class'].values
# collapse whitespace inside Gene/Variation values to underscores.
# FIX: regex=True is required — pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave '\s+' unmatched.
result.Gene = result.Gene.str.replace(r'\s+', '_', regex=True)
result.Variation = result.Variation.str.replace(r'\s+', '_', regex=True)
X_train, test_df, y_train, y_test = train_test_split(
    result, y_true, stratify=y_true, test_size=0.2, random_state=0)
# split the train data into train and cross validation by maintaining same
# distribution of output variable 'y_train' [stratify=y_train]
train_df, cv_df, y_train, y_cv = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.2, random_state=0)
#let's recheck our y's distribution in train,test and validation set
# it returns a dict, keys as class labels and values as the number of data points in that class
%matplotlib inline
train_class_distribution = train_df['Class'].value_counts().sortlevel()
test_class_distribution = test_df['Class'].value_counts().sortlevel()
cv_class_distribution = cv_df['Class'].value_counts().sortlevel()
my_colors = 'rgbkymc'
train_class_distribution.plot(kind='bar')
plt.xlabel('Class')
plt.ylabel('Data points per Class')
plt.title('Distribution of yi in train data')
plt.grid()
plt.show()
# ref: argsort https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
# -(train_class_distribution.values): the minus sign will give us in decreasing order
sorted_yi = np.argsort(-train_class_distribution.values)
for i in sorted_yi:
print('Number of data points in class', i+1, ':',train_class_distribution.values[i], '(', np.round((train_class_distribution.values[i]/train_df.shape[0]*100), 3), '%)')
print('-'*80)
# plot how the classes are spread across the held-out test set
my_colors = 'rgbkymc'
test_class_distribution.plot(kind='bar')
plt.xlabel('Class')
plt.ylabel('Data points per Class')
plt.title('Distribution of yi in test data')
plt.grid()
plt.show()

# walk the classes from most to least frequent and report their share
# (argsort of the negated counts yields indices in decreasing order)
sorted_yi = np.argsort(-test_class_distribution.values)
for i in sorted_yi:
    count = test_class_distribution.values[i]
    share = np.round((count / test_df.shape[0] * 100), 3)
    print('Number of data points in class', i+1, ':', count, '(', share, '%)')
print('-'*80)
# plot the class distribution of the cross-validation split
my_colors = 'rgbkymc'
cv_class_distribution.plot(kind='bar')
plt.xlabel('Class')
plt.ylabel('Data points per Class')
plt.title('Distribution of yi in cross validation data')
plt.grid()
plt.show()

# ref: argsort https://docs.scipy.org/doc/numpy/reference/generated/numpy.argsort.html
# BUG FIX: the ordering must come from the CV distribution itself — the
# original argsort-ed the *train* counts while printing CV counts, which
# misreports the per-class ranking of the CV split.
sorted_yi = np.argsort(-cv_class_distribution.values)
for i in sorted_yi:
    print('Number of data points in class', i+1, ':', cv_class_distribution.values[i],
          '(', np.round((cv_class_distribution.values[i]/cv_df.shape[0]*100), 3), '%)')
# prediction using a random model:
# we need 9 probabilities per row that sum to 1 — generate 9 random numbers
# and divide each by their sum. ref: https://stackoverflow.com/a/18662466/4084039
test_data_len = test_df.shape[0]
cv_data_len = cv_df.shape[0]

# output array with exactly the same number of rows as the CV data
cv_predicted_y = np.zeros((cv_data_len, 9))
for i in range(cv_data_len):
    rand_probs = np.random.rand(1, 9)
    # normalize so each row is a valid probability distribution
    # (ndarray.sum() replaces the opaque sum(sum(...)) idiom)
    cv_predicted_y[i] = (rand_probs / rand_probs.sum())[0]
print("Log loss on Cross Validation Data using Random Model",
      log_loss(y_cv, cv_predicted_y, eps=1e-15))

# Test-Set error: same random baseline on the test rows
test_predicted_y = np.zeros((test_data_len, 9))
for i in range(test_data_len):
    rand_probs = np.random.rand(1, 9)
    test_predicted_y[i] = (rand_probs / rand_probs.sum())[0]
print("Log loss on Test Data using Random Model",
      log_loss(y_test, test_predicted_y, eps=1e-15))

predicted_y = np.argmax(test_predicted_y, axis=1)
# NOTE(review): plot_confusion_matrix is not defined in this file chunk —
# presumably a helper from another notebook cell; confirm before running.
plot_confusion_matrix(y_test, predicted_y + 1)
# Let's see how predictive the gene feature alone is.
# NOTE(review): train/cv/test_gene_feature_onehotCoding are built further
# down in this file (notebook cell order) — run those cells first.
alpha = [10 ** x for x in range(-5, 1)] # hyperparam grid (regularization strength) for SGD classifier.
# read more about SGDClassifier() at http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
# ------------------------------
# default parameters
# SGDClassifier(loss=’hinge’, penalty=’l2’, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
# shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, learning_rate=’optimal’, eta0=0.0, power_t=0.5,
# class_weight=None, warm_start=False, average=False, n_iter=None)
# some of methods
# fit(X, y[, coef_init, intercept_init, …]) Fit linear model with Stochastic Gradient Descent.
# predict(X) Predict class labels for samples in X.
#-------------------------------
# video link:
#------------------------------
cv_log_error_array=[]
# grid-search alpha: fit logistic-loss SGD, calibrate the probabilities
# with a sigmoid, then score multi-class log loss on the CV split.
# NOTE(review): on scikit-learn >= 1.3 loss='log' must be spelled 'log_loss'.
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(train_gene_feature_onehotCoding, y_train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_gene_feature_onehotCoding, y_train)
    predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of alpha = ', i, "The log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
# plot CV log loss vs alpha, annotating each grid point
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
# refit at the alpha that minimized CV log loss and report
# train/CV/test losses to gauge over-/under-fitting
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(train_gene_feature_onehotCoding, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_gene_feature_onehotCoding, y_train)
predict_y = sig_clf.predict_proba(train_gene_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_gene_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_gene_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
alpha = [10 ** x for x in range(-5, 1)]
# let's see how predictive the variation feature alone is — same protocol
# as the gene feature above: SGD + sigmoid calibration, scored by log loss.
# NOTE(review): on scikit-learn >= 1.3 loss='log' must be spelled 'log_loss'.
cv_log_error_array=[]
for i in alpha:
    clf = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(train_variation_feature_onehotCoding, y_train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_variation_feature_onehotCoding, y_train)
    predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)
    cv_log_error_array.append(log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
    print('For values of alpha = ', i, "The log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
# plot CV log loss against alpha, annotating each grid point
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],np.round(txt,3)), (alpha[i],cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
# refit at the best alpha and report train/CV/test log loss
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(train_variation_feature_onehotCoding, y_train)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_variation_feature_onehotCoding, y_train)
predict_y = sig_clf.predict_proba(train_variation_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(y_train, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_variation_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(y_cv, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_variation_feature_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(y_test, predict_y, labels=clf.classes_, eps=1e-15))
from sklearn.dummy import DummyClassifier
# baselines: 'uniform' predicts every class with equal probability;
# 'stratified' samples predictions from the training class distribution.
dmclf1 = DummyClassifier(strategy='uniform')
dmclf2 = DummyClassifier(strategy='stratified')

# one-hot encoding of the Gene feature (vectorizer fitted on train only)
gene_vectorizer = CountVectorizer()
train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(train_df['Gene'])
test_gene_feature_onehotCoding = gene_vectorizer.transform(test_df['Gene'])
cv_gene_feature_onehotCoding = gene_vectorizer.transform(cv_df['Gene'])

dmclf1.fit(train_gene_feature_onehotCoding, y_train)
y_pred = dmclf1.predict(test_gene_feature_onehotCoding)
# FIX: print() the score — a bare expression is displayed in a notebook
# but silently discarded in a plain script.
print('uniform dummy accuracy   :', accuracy_score(y_test, y_pred))

dmclf2.fit(train_gene_feature_onehotCoding, y_train)
y_pred = dmclf2.predict(test_gene_feature_onehotCoding)
print('stratified dummy accuracy:', accuracy_score(y_test, y_pred))
# refresh labels from the merged frame
y_true = result.Class.values
# LET'S CREATE OUR TEXT FEATURES FOR FEEDING INTO BOOSTING LIBRARY - CATBOOST.
# Here the dimensionality of the tf-idf features (30k uni/bi-grams) is
# reduced to 500 using truncated SVD.
x_train,x_test,train_y,test_y = train_test_split(result, y_true, stratify=y_true, test_size=0.2,random_state = 0)
tf_text = TfidfVectorizer(max_features=30000,ngram_range=(1,2))
dm_red = TruncatedSVD(n_components=500)
# fit tf-idf and SVD on the training text only, then transform test text
tf_feat = tf_text.fit_transform(x_train.TEXT.values)
red_trn_feat = dm_red.fit_transform(tf_feat)
red_trn_features = pd.DataFrame(red_trn_feat)
# align the reduced features with the original row labels before concat
red_trn_features.index = x_train.index
trns_tst = tf_text.transform(x_test.TEXT.values)
red_tst_feat = dm_red.transform(trns_tst)
red_tst_features = pd.DataFrame(red_tst_feat)
red_tst_features.index = x_test.index
# column-wise concat: original columns followed by the 500 SVD components
train = pd.concat([x_train,red_trn_features],axis = 1)
test = pd.concat([x_test,red_tst_features],axis = 1)
# drop identifier/label/raw-text columns, keeping Gene/Variation + SVD dims
train = train.drop(['ID','Class','TEXT'],axis = 1)
test = test.drop(['ID','Class','TEXT'],axis = 1)
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score
# convert string columns to ordered categoricals, then make the frame
# fully numeric (train_cats / proc_df are helpers defined later in this file)
train_cats(train)
x,_,nas = proc_df(train)
%%time
model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=42,
    logging_level='Silent'
)
# reuse the model's defaults for cv(), but switch to the multi-class loss
cv_params = model.get_params()
cv_params.update({
    'loss_function': 'MultiClass'
})
# Gene and Variation are the first two columns -> categorical for catboost
categorical_features_indices = [0,1]
cv_data = cv(
    Pool(x, train_y, cat_features=categorical_features_indices),
    cv_params,
    plot=True
)#This performs 3 fold cross validation with real time plotting....
cv_data.head(4)
# report the best iteration for both tracked metrics
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))
print('Best validation MultiClass score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-MultiClass-mean']),
    cv_data['test-MultiClass-std'][np.argmax(cv_data['test-MultiClass-mean'])],
    np.argmax(cv_data['test-MultiClass-mean'])
))
# apply the train categorical encodings to the test frame, then numericalize
apply_cats(test,train)
test_x,_,nas = proc_df(test)
test_x.head(2)
%%time
#let's do early stopping
# this prevents overfitting and also gives faster training ... this is the
# final model we will be using.
earlystop_params = cv_params.copy()
earlystop_params.update({
    'od_type': 'Iter',  # overfitting detector: stop after od_wait bad iters
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(Pool(x, train_y, cat_features=categorical_features_indices))
earlystop_model.best_score_
probs = earlystop_model.predict_proba(test_x)
probs.shape
# log loss needs a one-hot truth matrix matching probs' shape
from sklearn.metrics import log_loss
y_true = np.zeros(probs.shape)
y_true[0,:]
# classes are labelled 1..9, hence the val-1 column index
for id,val in enumerate(test_y.tolist()):
    y_true[id,val-1] = 1
log_loss(y_true,probs) # That's our score on test set ..............
# feature importances, largest first.
# NOTE(review): names and importances look swapped here (importances used
# as the index) — confirm this ordering is the intent before relying on it.
feat_imp = pd.DataFrame(earlystop_model.feature_names_,index=earlystop_model.feature_importances_)
feat_imp.sort_index(inplace=True,ascending=False)
feat_imp.head(10)
# model_select collects candidate models with their observed test losses.
# NOTE(review): model_select is not initialised anywhere in this chunk —
# presumably created in an earlier cell; confirm before running top-to-bottom.
model_select['Catboost'] = {'clf':earlystop_model,
                            'best_testloss':0.89}
len(x),len(train_y),len(test_x),len(test_y)
# hyperparameter search for a random forest on the numeric feature frame x:
# grid over n_estimators (alpha) x max_depth, scored by CV log loss.
trn_x, crsvl_x, trn_y, crsvl_y = train_test_split(x, train_y, test_size=0.2)
alpha = [10, 50, 100, 200, 500, 1000]   # n_estimators grid
max_depth = [2, 3, 5, 10]               # tree-depth grid
cv_log_error_array = []
for i in alpha:
    for j in max_depth:
        print("for n_estimators =", i, "and max depth = ", j)
        clf = RandomForestClassifier(n_estimators=i, criterion='gini', max_depth=j, random_state=42, n_jobs=-1)
        clf.fit(trn_x, trn_y)
        sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
        sig_clf.fit(trn_x, trn_y)
        sig_clf_probs = sig_clf.predict_proba(crsvl_x)
        cv_log_error_array.append(log_loss(crsvl_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
        print("Log Loss :", log_loss(crsvl_y, sig_clf_probs))

# the flat index enumerates the grid row-major:
# index = alpha_idx * len(max_depth) + depth_idx.
# FIX: use divmod with len(max_depth) instead of hard-coding 4, so the
# recovery of the two indices stays correct if either grid changes.
best_index = int(np.argmin(cv_log_error_array))
best_alpha_idx, best_depth_idx = divmod(best_index, len(max_depth))
clf = RandomForestClassifier(n_estimators=alpha[best_alpha_idx], criterion='gini',
                             max_depth=max_depth[best_depth_idx], random_state=42, n_jobs=-1)
clf.fit(trn_x, trn_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(trn_x, trn_y)
# report train/CV/test losses for the selected configuration
predict_y = sig_clf.predict_proba(trn_x)
print('For values of best alpha = ', alpha[best_alpha_idx], "The train log loss is:", log_loss(trn_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(crsvl_x)
print('For values of best alpha = ', alpha[best_alpha_idx], "The cross validation log loss is:", log_loss(crsvl_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x)
print('For values of best alpha = ', alpha[best_alpha_idx], "The test log loss is:", log_loss(test_y, predict_y, labels=clf.classes_, eps=1e-15))
# record the model + observed losses for later comparison
model_select['RandomForest_numerical_feat'] = {'clf': clf,
                                               'best_trainloss': 0.55,
                                               'best_crossvalloss': 1.14,
                                               'best_testloss': 1.1}
# some utility functions (fastai-style) — not required for modern boosting
# libraries like catboost but needed for random forests; they handle edge
# cases like new categories appearing in the test set, or None values.
#from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, StandardScaler
# FIX: Imputer was removed from sklearn.preprocessing in scikit-learn 0.22;
# its replacement lives in sklearn.impute. Kept under the old name so any
# later cell referencing Imputer still resolves.
from sklearn.impute import SimpleImputer as Imputer
from pandas.api.types import is_string_dtype, is_numeric_dtype
# FIX: sklearn.ensemble.forest became private in scikit-learn 0.24 and the
# name is unused below — fall back to None instead of crashing on import.
try:
    from sklearn.ensemble import forest
except ImportError:
    forest = None
def train_cats(df):
    """Convert every string column of *df* to an ordered categorical, in place.

    Non-string columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose string columns will be re-typed in place.

    Examples
    --------
    >>> df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'a']})
    >>> train_cats(df)
    >>> df['col2'].dtype.name
    'category'
    """
    for col_name, col in df.items():
        if not is_string_dtype(col):
            continue
        df[col_name] = col.astype('category').cat.as_ordered()
def apply_cats(df, trn):
    """Encode string columns of *df* as categoricals using *trn* as template.

    For every column present in both frames that is categorical in ``trn``,
    re-encode ``df``'s column with exactly the category set (and ordering)
    learned from ``trn``. Values unseen in ``trn`` become NaN. Mutates *df*
    in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to re-encode.
    trn : pandas.DataFrame
        Template frame whose categorical columns define the codes.
    """
    for col_name, col in df.items():
        template_is_cat = (col_name in trn.columns
                           and trn[col_name].dtype.name == 'category')
        if template_is_cat:
            df[col_name] = pd.Categorical(
                col, categories=trn[col_name].cat.categories, ordered=True)
def fix_missing(df, col, name, na_dict):
    """Median-fill missing values in a numeric column and flag them.

    When *col* is numeric and either contains NaNs or already has an entry
    in *na_dict*, this adds a boolean ``{name}_na`` column marking the
    missing rows, fills the gaps (with ``na_dict[name]`` when present,
    otherwise the column median), and records the filler in *na_dict*.
    Non-numeric columns — and numeric columns with no NaNs and no na_dict
    entry — are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame modified in place.
    col : pandas.Series
        Column data to inspect (typically ``df[name]``).
    name : str
        Column name written back into *df*.
    na_dict : dict
        Mapping of column name -> filler value; updated and returned.

    Returns
    -------
    dict
        The (possibly updated) *na_dict*.
    """
    if not is_numeric_dtype(col):
        return na_dict
    missing = pd.isnull(col)
    if missing.sum() or (name in na_dict):
        # indicator column so models can learn from "was missing"
        df[name + '_na'] = missing
        filler = na_dict[name] if name in na_dict else col.median()
        df[name] = col.fillna(filler)
        na_dict[name] = filler
    return na_dict
def get_nn_mappers(df, cat_vars, contin_vars):
    """Build fitted DataFrameMappers for categorical and continuous columns.

    Returns a (categorical_mapper, continuous_mapper) pair fitted on *df*:
    categoricals are label-encoded, continuous columns standardized.
    Mutates *df* by filling its nulls first.
    """
    # Replace nulls: continuous columns get a sentinel above the max, "" -> '#NA#' for categoricals.
    for v in contin_vars: df[v] = df[v].fillna(df[v].max()+100,)
    for v in cat_vars: df[v].fillna('#NA#', inplace=True)
    # list of tuples, containing variable and instance of a transformer for that variable
    # for categoricals, use LabelEncoder to map to integers. For continuous, standardize
    cat_maps = [(o, LabelEncoder()) for o in cat_vars]
    contin_maps = [([o], StandardScaler()) for o in contin_vars]
    # NOTE(review): DataFrameMapper comes from sklearn_pandas, whose import
    # is commented out above — re-enable it before calling this helper.
    return DataFrameMapper(cat_maps).fit(df), DataFrameMapper(contin_maps).fit(df)
def scale_vars(df, mapper):
    """Standardize the numeric columns of *df* in place.

    If *mapper* is None a new DataFrameMapper of StandardScalers is fitted
    on every numeric column; otherwise the given (already fitted) mapper is
    applied as-is. Returns the mapper so it can be reused at test time.
    """
    # BUG FIX: the bare name `sklearn` was never imported at file level
    # (only submodules were), so sklearn.exceptions raised NameError.
    import sklearn.exceptions
    warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        # NOTE(review): DataFrameMapper requires the commented-out
        # sklearn_pandas import near the top of this section.
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper
def numericalize(df, col, name, max_n_cat):
    """Replace a high-cardinality categorical column with its integer codes.

    ``df[name]`` is overwritten with ``col``'s category codes shifted by +1
    (so NaN, coded -1 by pandas, becomes 0). Columns that are already
    numeric are skipped. When *max_n_cat* is a number, only categoricals
    with **more** than that many categories are converted — smaller ones
    stay categorical (so a later ``pd.get_dummies`` can one-hot them).
    Pass ``max_n_cat=None`` to always convert.

    Parameters
    ----------
    df : pandas.DataFrame modified in place.
    col : pandas.Series — the (categorical) column to convert.
    name : str — column name written back into *df*.
    max_n_cat : int or None — cardinality threshold described above.
    """
    if is_numeric_dtype(col):
        return
    exceeds_limit = max_n_cat is None or len(col.cat.categories) > max_n_cat
    if exceeds_limit:
        df[name] = col.cat.codes + 1
def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """Split off the response variable and make *df* entirely numeric.

    Parameters
    ----------
    df: The data frame you wish to process.
    y_fld: The name of the response variable column (None for no response).
    skip_flds: A list of fields to drop from df.
    ignore_flds: A list of fields passed through untouched.
    do_scale: If True, standardizes each numeric column of df.
    na_dict: A dictionary of na columns to add. Na columns are also added
        if there are any missing values.
    preproc_fn: A function applied to df before any other processing.
    max_n_cat: Categoricals with more than this many categories become
        integer codes; smaller ones are one-hot encoded by get_dummies
        (None = always integer codes).
    subset: Takes a random subset of size subset from df (uses get_sample,
        defined elsewhere).
    mapper: A fitted DataFrameMapper from a previous (training-time) call,
        reused so test data is scaled with the training statistics.

    Returns
    -------
    [x, y, nas, mapper(optional)]:
        x: transformed, entirely numeric version of df without the response.
        y: the response values (category codes if the column was categorical).
        nas: dict of the na columns created, mapping name -> filler value.
        mapper: returned only when do_scale is True.

    Examples
    --------
    >>> df = pd.DataFrame({'col1' : [1, 2, 3], 'col2' : ['a', 'b', 'a']})
    >>> train_cats(df)
    >>> x, y, nas = proc_df(df, 'col1')
    >>> x
       col2
    0     1
    1     2
    2     1
    """
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    # optionally work on a random sample of df
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    # set aside ignored columns and re-attach them untouched at the end
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    # split off the response variable (categorical targets become codes)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)
    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    # median-fill numeric NaNs, adding *_na indicator columns
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    # drop indicator columns for NaNs that appeared only in this frame, so
    # the column set stays aligned with the frame na_dict was built from
    if len(na_dict_initial.keys()) > 0:
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    # high-cardinality categoricals -> integer codes; the remaining
    # categoricals are one-hot encoded by get_dummies below
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res
# code for response coding with Laplace smoothing.
# alpha : used for laplace smoothing
# feature: ['gene', 'variation']
# df: ['train_df', 'test_df', 'cv_df']
# algorithm
# ----------
# Consider all unique values and the number of occurrences of the given feature in the train dataframe
# build a (1*9) vector: first element = (count in class1 + 10*alpha) / (count in total data + 90*alpha)
# gv_dict is like a look up table, for every gene it store a (1*9) representation of it
# for a value of feature in df:
# if it is in train data:
# we add the vector that was stored in 'gv_dict' look up table to 'gv_fea'
# if it is not there in the train data:
# we add [1/9, 1/9, 1/9, 1/9,1/9, 1/9, 1/9, 1/9, 1/9] to 'gv_fea'
# return 'gv_fea'
# ----------------------
# get_gv_fea_dict: Get Gene variation Feature Dict
def get_gv_fea_dict(alpha, feature, df):
    """Build the response-coding lookup table for a categorical feature.

    For every unique value of *feature* in the (global) ``train_df``,
    computes a 9-dimensional vector of Laplace-smoothed class probabilities:

        P(class==k | feature==v) ~= (count(v, k) + 10*alpha) / (count(v) + 90*alpha)

    Parameters
    ----------
    alpha: Laplace smoothing strength.
    feature: column name, e.g. 'Gene' or 'Variation'.
    df: unused here — statistics are always taken from train_df so that
        encoding test/CV rows never leaks their own labels.

    Returns
    -------
    dict mapping feature value -> list of 9 smoothed probabilities.
    """
    # value_count: occurrences of each feature value in the train data, e.g.
    # train_df['Gene'].value_counts() -> BRCA1 174, TP53 106, EGFR 86, ...
    # train_df['Variation'].value_counts() -> Truncating_Mutations 63, Deletion 43, ...
    value_count = train_df[feature].value_counts()
    # gv_dict : Gene Variation Dict, the probability array for each gene/variation
    gv_dict = dict()
    # denominator: number of times this particular value occurs in train data
    for i, denominator in value_count.items():
        # vec: P(class==k | feature==i) for k = 1..9 (9-dimensional vector)
        vec = []
        for k in range(1,10):
            # rows of class k having this feature value, e.g.
            # train_df.loc[(train_df['Class']==1) & (train_df['Gene']=='BRCA1')]
            # cls_cnt.shape[0] gives the number of such rows (the numerator)
            cls_cnt = train_df.loc[(train_df['Class']==k) & (train_df[feature]==i)]
            vec.append((cls_cnt.shape[0] + alpha*10)/ (denominator + 90*alpha))
        # store the smoothed probability vector under the feature value
        gv_dict[i]=vec
    return gv_dict
# Get Gene variation feature
def get_gv_feature(alpha, feature, df):
    """Response-code *feature* for every row of *df*.

    Values seen in the (global) ``train_df`` get their Laplace-smoothed
    class-probability vector from ``get_gv_fea_dict`` — e.g.
    gv_dict['BRCA1'] -> [0.2007..., 0.0378..., ...]; values unseen in
    train fall back to the uniform vector [1/9]*9.

    Parameters
    ----------
    alpha: Laplace smoothing strength (forwarded to get_gv_fea_dict).
    feature: column name, e.g. 'Gene' or 'Variation'.
    df: frame whose rows are encoded (train_df, test_df or cv_df).

    Returns
    -------
    list of 9-element probability lists, one per row of *df*.
    """
    gv_dict = get_gv_fea_dict(alpha, feature, df)
    # feature values present in the train data
    value_count = train_df[feature].value_counts()
    # PERF FIX: build the membership set once — the original rebuilt
    # dict(value_count) on every row of df, O(n) work per row.
    known_values = set(value_count.index)
    gv_fea = []
    for index, row in df.iterrows():
        if row[feature] in known_values:
            gv_fea.append(gv_dict[row[feature]])
        else:
            # unseen in train -> uninformative uniform distribution
            gv_fea.append([1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9, 1/9])
    return gv_fea
# alpha is used for laplace smoothing
alpha = 1
# response coding of the Variation feature (statistics come from train_df,
# so test/CV rows are encoded without using their own labels)
train_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", train_df))
# test variation feature
test_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", test_df))
# cross validation variation feature
cv_variation_feature_responseCoding = np.array(get_gv_feature(alpha, "Variation", cv_df))
# one-hot encoding of the variation feature (vectorizer fitted on train only)
variation_vectorizer = CountVectorizer()
train_variation_feature_onehotCoding = variation_vectorizer.fit_transform(train_df['Variation'])
test_variation_feature_onehotCoding = variation_vectorizer.transform(test_df['Variation'])
cv_variation_feature_onehotCoding = variation_vectorizer.transform(cv_df['Variation'])
# response-coding of the Gene feature
# alpha is used for laplace smoothing
alpha = 1
# train gene feature
train_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", train_df))
# test gene feature
test_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", test_df))
# cross validation gene feature
cv_gene_feature_responseCoding = np.array(get_gv_feature(alpha, "Gene", cv_df))
# one-hot encoding of the Gene feature (fitted on train only)
gene_vectorizer = CountVectorizer()
train_gene_feature_onehotCoding = gene_vectorizer.fit_transform(train_df['Gene'])
test_gene_feature_onehotCoding = gene_vectorizer.transform(test_df['Gene'])
cv_gene_feature_onehotCoding = gene_vectorizer.transform(cv_df['Gene'])
%%time
def extract_dictionary_paddle(cls_text):
    """Count word occurrences over the TEXT column of *cls_text*.

    Returns a defaultdict(int) mapping each whitespace-separated token to
    the number of times it appears across all rows of the data frame.
    """
    word_counts = defaultdict(int)
    for document in cls_text['TEXT']:
        for token in document.split():
            word_counts[token] += 1
    return word_counts
import math
#https://stackoverflow.com/a/1602964
def get_text_responsecoding(df):
    """Response-code the TEXT column of *df* into a (n_rows, 9) matrix.

    Entry [r][c] is the geometric mean of the Laplace-smoothed per-word
    likelihoods of row r's text under class c+1, computed from the
    module-level dicts `dict_list` (per-class counts) and `total_dict`
    (whole-corpus counts).
    """
    n_classes = 9
    coded = np.zeros((df.shape[0], n_classes))
    for cls_idx in range(n_classes):
        for pos, (_, row) in enumerate(df.iterrows()):
            words = row['TEXT'].split()
            log_prob_sum = 0
            for word in words:
                # +10 / +90 is Laplace smoothing (10 per class, 9 classes)
                log_prob_sum += math.log(((dict_list[cls_idx].get(word,0)+10 )/(total_dict.get(word,0)+90)))
            coded[pos][cls_idx] = math.exp(log_prob_sum/len(words))
    return coded
# building a CountVectorizer over uni- and bi-grams, capped at 30000 features
# NOTE(review): the original comment claimed "minimum 3 times in train data",
# but no min_df is passed here — confirm whether min_df=3 was intended.
text_vectorizer = CountVectorizer(ngram_range=(1,2),max_features=30000)
train_text_feature_onehotCoding = text_vectorizer.fit_transform(train_df['TEXT'])
train_text_features= text_vectorizer.get_feature_names()
# .sum(axis=0).A1 sums each column -> a (1 * n_features) vector of term counts
train_text_fea_counts = train_text_feature_onehotCoding.sum(axis=0).A1
# map each n-gram to the number of times it occurred in the train corpus
text_fea_dict = dict(zip(list(train_text_features),train_text_fea_counts))
print("Total number of unique words in train data :", len(train_text_features))
%%time
# building a TfidfVectorizer keeping only the top 1000 features (min_df=3)
text_vectorizer = TfidfVectorizer(min_df=3,max_features=1000)
train_text_feature_tfidfCoding = text_vectorizer.fit_transform(train_df['TEXT'])
train_text_features= text_vectorizer.get_feature_names()
# .sum(axis=0).A1 sums each column -> a (1 * n_features) vector of tf-idf mass
train_text_fea_counts = train_text_feature_tfidfCoding.sum(axis=0).A1
# map each word to its summed tf-idf weight over the train corpus
text_fea_dict = dict(zip(list(train_text_features),train_text_fea_counts))
print("Total number of unique words in train data :", len(train_text_features))
dict_list = []
# dict_list holds 9 dictionaries of word counts, one per class label 1..9
for i in range(1,10):
    cls_text = train_df[train_df['Class']==i]
    # build a word dict based on the words in that class
    dict_list.append(extract_dictionary_paddle(cls_text))
# dict_list[i] is built on the text of class i+1
# total_dict is built on the whole training text data
total_dict = extract_dictionary_paddle(train_df)
# For every word in the train vocabulary, compute its Laplace-smoothed
# likelihood under each of the 9 classes: (count_in_class + 10) / (total + 90).
# Fix: dropped the dead local `max_val = -1`, which was assigned but never read.
confuse_array = []
for word in train_text_features:
    ratios = []
    for cls_idx in range(0,9):
        ratios.append((dict_list[cls_idx][word]+10 )/(total_dict[word]+90))
    confuse_array.append(ratios)
confuse_array = np.array(confuse_array)
# response coding of text features: one (n_rows, 9) class-likelihood matrix per split
train_text_feature_responseCoding = get_text_responsecoding(train_df)
test_text_feature_responseCoding = get_text_responsecoding(test_df)
cv_text_feature_responseCoding = get_text_responsecoding(cv_df)
# normalize each ROW so the 9 class scores of a data point sum to 1
train_text_feature_responseCoding = (train_text_feature_responseCoding.T/train_text_feature_responseCoding.sum(axis=1)).T
test_text_feature_responseCoding = (test_text_feature_responseCoding.T/test_text_feature_responseCoding.sum(axis=1)).T
cv_text_feature_responseCoding = (cv_text_feature_responseCoding.T/cv_text_feature_responseCoding.sum(axis=1)).T
# Normalize every tf-idf feature column (train/test/cv).
# Fix: the original assigned the normalized train matrix to a MISSPELLED name
# (`train_text_feature_tdfidCoding`), so the downstream hstack consumed the
# un-normalized train features while test/cv were normalized.
train_text_feature_tfidfCoding = normalize(train_text_feature_tfidfCoding, axis=0)
# we use the same vectorizer that was trained on train data
test_text_feature_tfidfCoding = text_vectorizer.transform(test_df['TEXT'])
test_text_feature_tfidfCoding = normalize(test_text_feature_tfidfCoding, axis=0)
# we use the same vectorizer that was trained on train data
cv_text_feature_tfidfCoding = text_vectorizer.transform(cv_df['TEXT'])
cv_text_feature_tfidfCoding = normalize(cv_text_feature_tfidfCoding, axis=0)
# now let's get all the features in place:
# sparse one-hot design matrices: [gene one-hot | variation one-hot | text tf-idf]
train_gene_var_onehotCoding = hstack((train_gene_feature_onehotCoding,train_variation_feature_onehotCoding))
test_gene_var_onehotCoding = hstack((test_gene_feature_onehotCoding,test_variation_feature_onehotCoding))
cv_gene_var_onehotCoding = hstack((cv_gene_feature_onehotCoding,cv_variation_feature_onehotCoding))
train_x_onehotCoding = hstack((train_gene_var_onehotCoding, train_text_feature_tfidfCoding)).tocsr()
train_y = np.array(list(train_df['Class']))
test_x_onehotCoding = hstack((test_gene_var_onehotCoding, test_text_feature_tfidfCoding)).tocsr()
test_y = np.array(list(test_df['Class']))
cv_x_onehotCoding = hstack((cv_gene_var_onehotCoding, cv_text_feature_tfidfCoding)).tocsr()
cv_y = np.array(list(cv_df['Class']))
# dense response-coded design matrices: [gene (9) | variation (9) | text (9)] = 27 cols
train_gene_var_responseCoding = np.hstack((train_gene_feature_responseCoding,train_variation_feature_responseCoding))
test_gene_var_responseCoding = np.hstack((test_gene_feature_responseCoding,test_variation_feature_responseCoding))
cv_gene_var_responseCoding = np.hstack((cv_gene_feature_responseCoding,cv_variation_feature_responseCoding))
train_x_responseCoding = np.hstack((train_gene_var_responseCoding, train_text_feature_responseCoding))
test_x_responseCoding = np.hstack((test_gene_var_responseCoding, test_text_feature_responseCoding))
cv_x_responseCoding = np.hstack((cv_gene_var_responseCoding, cv_text_feature_responseCoding))
result['TEXT'].head(2)
# tokenize every document once; gensim Word2Vec consumes these token lists later
sentences = [document.split() for document in result['TEXT']]
len(sentences)
# This function plots the confusion matrices given y_i, y_i_hat.
def plot_confusion_matrix(test_y, predict_y):
    """Plot raw confusion, precision and recall heatmaps for 9-class labels."""
    C = confusion_matrix(test_y, predict_y)
    # C is a 9x9 matrix; cell (i, j) counts points of class i predicted as class j
    # A: divide each row of C by its row sum -> recall matrix (rows sum to 1),
    # e.g. C=[[1,2],[3,4]] -> A=[[1/3,2/3],[3/7,4/7]]
    A =(((C.T)/(C.sum(axis=1))).T)
    # B: divide each column of C by its column sum -> precision matrix
    # (columns sum to 1), e.g. C=[[1,2],[3,4]] -> B=[[1/4,2/6],[3/4,4/6]]
    B =(C/C.sum(axis=0))
    labels = [1,2,3,4,5,6,7,8,9]
    # raw-count heatmap (C)
    print("-"*20, "Confusion matrix", "-"*20)
    plt.figure(figsize=(20,7))
    sns.heatmap(C, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    # precision heatmap (B)
    print("-"*20, "Precision matrix (Columm Sum=1)", "-"*20)
    plt.figure(figsize=(20,7))
    sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    # recall heatmap (A)
    print("-"*20, "Recall matrix (Row sum=1)", "-"*20)
    plt.figure(figsize=(20,7))
    sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
def predict_and_plot_confusion_matrix(train_x, train_y,test_x, test_y, clf):
    """Fit *clf*, sigmoid-calibrate it, report CV metrics and plot matrices."""
    clf.fit(train_x, train_y)
    calibrated = CalibratedClassifierCV(clf, method="sigmoid")
    calibrated.fit(train_x, train_y)
    pred_y = calibrated.predict(test_x)
    # log_loss needs per-class probabilities, not hard labels
    print("Log loss :",log_loss(test_y, calibrated.predict_proba(test_x)))
    # fraction of points whose predicted label differs from the true label
    print("Number of mis-classified points :", np.count_nonzero(pred_y - test_y)/test_y.shape[0])
    plot_confusion_matrix(test_y, pred_y)
def report_log_loss(train_x, train_y, test_x, test_y, clf):
    """Fit *clf*, sigmoid-calibrate it, and return the log loss on (test_x, test_y)."""
    clf.fit(train_x, train_y)
    calibrated = CalibratedClassifierCV(clf, method="sigmoid")
    calibrated.fit(train_x, train_y)
    probabilities = calibrated.predict_proba(test_x)
    return log_loss(test_y, probabilities, eps=1e-15)
# used only for naive bayes / linear models: given the indices of the top
# coefficients, print the corresponding feature names and whether each one
# occurs in the given test point (gene, variation, text)
def get_impfeature_names(indices, text, gene, var, no_features):
    """Report which of the *indices* features appear in the query point."""
    gene_count_vec = CountVectorizer()
    var_count_vec = CountVectorizer()
    text_count_vec = CountVectorizer(min_df=3)
    gene_vec = gene_count_vec.fit(train_df['Gene'])
    var_vec = var_count_vec.fit(train_df['Variation'])
    text_vec = text_count_vec.fit(train_df['TEXT'])
    # materialize each vocabulary once instead of re-calling get_feature_names
    gene_features = gene_vec.get_feature_names()
    var_features = var_vec.get_feature_names()
    text_features = text_vec.get_feature_names()
    fea1_len = len(gene_features)
    fea2_len = len(var_features)
    tokens = text.split()
    word_present = 0
    for i, v in enumerate(indices):
        # index space is [gene vocab | variation vocab | text vocab]
        if v < fea1_len:
            word = gene_features[v]
            yes_no = (word == gene)
            if yes_no:
                word_present += 1
                print(i, "Gene feature [{}] present in test data point [{}]".format(word,yes_no))
        elif v < fea1_len + fea2_len:
            word = var_features[v - fea1_len]
            yes_no = (word == var)
            if yes_no:
                word_present += 1
                print(i, "variation feature [{}] present in test data point [{}]".format(word,yes_no))
        else:
            word = text_features[v - (fea1_len + fea2_len)]
            yes_no = (word in tokens)
            if yes_no:
                word_present += 1
                print(i, "Text feature [{}] present in test data point [{}]".format(word,yes_no))
    print("Out of the top ",no_features," features ", word_present, "are present in query point")
# --- Hyperparameter search for MultinomialNB (Laplace smoothing alpha) ---
alpha = [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = MultinomialNB(alpha=i)
    clf.fit(train_x_onehotCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_onehotCoding, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding error while multiplying probabilites we use log-probability estimates
    print("Log Loss :",log_loss(cv_y, sig_clf_probs))
# plot CV log loss against log10(alpha)
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]),cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
# Refit MultinomialNB at the best alpha and report losses on all three splits.
# Fix: the original passed the undefined names y_train / y_cv / y_test to
# log_loss (NameError at runtime); the label arrays defined in this script
# are train_y / cv_y / test_y.
best_alpha = np.argmin(cv_log_error_array)
clf = MultinomialNB(alpha=alpha[best_alpha])
clf.fit(train_x_onehotCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_onehotCoding, train_y)
predict_y = sig_clf.predict_proba(train_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(train_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(cv_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(test_y, predict_y, labels=clf.classes_, eps=1e-15))
# record the fitted model and its losses for the end-of-notebook comparison
model_select = {}
model_select['MultinomialNB'] = {'clf':clf,'best_trainloss':0.55,'best_crossvalloss':1.18,'best_testloss':1.16}
from sklearn.externals import joblib
joblib.dump(clf,'MulNB_cancer.joblib')
# ----------------------------
# refit MultinomialNB at the best alpha and inspect CV-split performance
clf = MultinomialNB(alpha=alpha[best_alpha])
clf.fit(train_x_onehotCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_onehotCoding, train_y)
sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
# to avoid rounding error while multiplying probabilites we use log-probability estimates
print("Log Loss :",log_loss(cv_y, sig_clf_probs))
print("Number of missclassified point :", np.count_nonzero((sig_clf.predict(cv_x_onehotCoding)- cv_y))/cv_y.shape[0])
plot_confusion_matrix(cv_y, sig_clf.predict(cv_x_onehotCoding.toarray()))
# feature importance for a (correctly classified) test point
test_point_index = 1
no_feature = 100
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
print("Predicted Class :", predicted_cls[0])
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]),4))
print("Actual Class :", test_y[test_point_index])
# indices of the no_feature largest coefficients for the predicted class
indices = np.argsort(-clf.coef_)[predicted_cls-1][:,:no_feature]
print("-"*50)
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index],test_df['Gene'].iloc[test_point_index],test_df['Variation'].iloc[test_point_index], no_feature)
# feature importance for an incorrectly classified test point
test_point_index = 100
no_feature = 100
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
print("Predicted Class :", predicted_cls[0])
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]),4))
print("Actual Class :", test_y[test_point_index])
# indices of the no_feature largest coefficients for the predicted class
indices = np.argsort(-clf.coef_)[predicted_cls-1][:,:no_feature]
print("-"*50)
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index],test_df['Gene'].iloc[test_point_index],test_df['Variation'].iloc[test_point_index], no_feature)
# --- Hyperparameter search for KNN (n_neighbors) on response-coded features ---
alpha = [5, 11, 15, 21, 31, 41, 51, 99]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(train_x_responseCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_responseCoding, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_responseCoding)
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding error while multiplying probabilites we use log-probability estimates
    print("Log Loss :",log_loss(cv_y, sig_clf_probs))
# plot CV log loss against n_neighbors
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (alpha[i],cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
# Refit KNN at the best n_neighbors and report losses on all three splits.
# Fix: the original passed the undefined names y_train / y_cv / y_test to
# log_loss; the label arrays defined in this script are train_y / cv_y / test_y.
best_alpha = np.argmin(cv_log_error_array)
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
clf.fit(train_x_responseCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_responseCoding, train_y)
predict_y = sig_clf.predict_proba(train_x_responseCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(train_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_x_responseCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(cv_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x_responseCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(test_y, predict_y, labels=clf.classes_, eps=1e-15))
model_select['Knn'] = {'clf':clf,'best_trainloss':0.65,
                       'best_crossvalloss':1.05,
                       'best_testloss':1.04}
# testing with best hyperparameters
clf = KNeighborsClassifier(n_neighbors=alpha[best_alpha])
predict_and_plot_confusion_matrix(train_x_responseCoding, train_y, cv_x_responseCoding, cv_y, clf)
%%time
# building a CountVectorizer with uni/bi-grams (min_df=3) for logistic regression
text_vectorizer = CountVectorizer(min_df=3,ngram_range=(1,2))
train_text_feature_bowCoding = text_vectorizer.fit_transform(train_df['TEXT'])
train_text_features_bow= text_vectorizer.get_feature_names()
# .sum(axis=0).A1 sums each column -> a (1 * n_features) vector of term counts
train_text_fea_counts = train_text_feature_bowCoding.sum(axis=0).A1
# map each n-gram to the number of times it occurred in the train corpus
text_fea_dict = dict(zip(list(train_text_features_bow),train_text_fea_counts))
print("Total number of unique words in train data :", len(train_text_features_bow))
# Normalize each bag-of-words column, then assemble the combined design
# matrices for the bag-of-words text representation.
# Fix: the original stored the normalized CV matrix into
# cv_text_feature_tfidfCoding (clobbering the tf-idf CV features) and then
# stacked the UN-normalized cv_text_feature_bowCoding below.
train_text_feature_bowCoding = normalize(train_text_feature_bowCoding, axis=0)
# we use the same vectorizer that was trained on train data
test_text_feature_bowCoding = text_vectorizer.transform(test_df['TEXT'])
test_text_feature_bowCoding = normalize(test_text_feature_bowCoding, axis=0)
# we use the same vectorizer that was trained on train data
cv_text_feature_bowCoding = text_vectorizer.transform(cv_df['TEXT'])
cv_text_feature_bowCoding = normalize(cv_text_feature_bowCoding, axis=0)
# assemble [gene one-hot | variation one-hot | text bag-of-words] per split
train_gene_var_onehotCoding = hstack((train_gene_feature_onehotCoding,train_variation_feature_onehotCoding))
test_gene_var_onehotCoding = hstack((test_gene_feature_onehotCoding,test_variation_feature_onehotCoding))
cv_gene_var_onehotCoding = hstack((cv_gene_feature_onehotCoding,cv_variation_feature_onehotCoding))
train_x_onehotCoding_tbow = hstack((train_gene_var_onehotCoding, train_text_feature_bowCoding)).tocsr()
train_y = np.array(list(train_df['Class']))
test_x_onehotCoding_tbow = hstack((test_gene_var_onehotCoding, test_text_feature_bowCoding)).tocsr()
test_y = np.array(list(test_df['Class']))
cv_x_onehotCoding_tbow = hstack((cv_gene_var_onehotCoding, cv_text_feature_bowCoding)).tocsr()
cv_y = np.array(list(cv_df['Class']))
# --- Hyperparameter search: logistic regression (SGD, log loss), class-balanced ---
alpha = [10 ** x for x in range(-6, 3)]
cv_log_error_array = []
for i in alpha:
    print("for alpha =", i)
    clf = SGDClassifier(class_weight='balanced', alpha=i, penalty='l2', loss='log', random_state=42)
    clf.fit(train_x_onehotCoding_tbow, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_onehotCoding_tbow, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding_tbow)
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    # to avoid rounding error while multiplying probabilites we use log-probability estimates
    print("Log Loss :",log_loss(cv_y, sig_clf_probs))
# plot CV log loss against alpha
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (alpha[i],cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
# Refit logistic regression at the best alpha and report losses per split.
# Fix: undefined y_train / y_cv / y_test replaced with train_y / cv_y / test_y.
best_alpha = np.argmin(cv_log_error_array)
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
clf.fit(train_x_onehotCoding_tbow, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_onehotCoding_tbow, train_y)
predict_y = sig_clf.predict_proba(train_x_onehotCoding_tbow)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(train_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_x_onehotCoding_tbow)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(cv_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x_onehotCoding_tbow)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(test_y, predict_y, labels=clf.classes_, eps=1e-15))
model_select['LogisticRegression_bow'] = {'clf':clf,'best_trainloss':1.36,
                                          'best_crossvalloss':2.29,
                                          'best_testloss':1.59}
# TESTING THE MODEL WITH BEST HYPERPARAMETERS
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='log', random_state=42)
predict_and_plot_confusion_matrix(train_x_onehotCoding_tbow, train_y, cv_x_onehotCoding_tbow, cv_y, clf)
# --- Hyperparameter search: linear SVM via SGD (hinge loss), class-balanced ---
alpha = [10 ** x for x in range(-5, 3)]
cv_log_error_array = []
for i in alpha:
    print("for C =", i)
    # clf = SVC(C=i,kernel='linear',probability=True, class_weight='balanced')
    clf = SGDClassifier( class_weight='balanced', alpha=i, penalty='l2', loss='hinge', random_state=42)
    clf.fit(train_x_onehotCoding, train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x_onehotCoding, train_y)
    sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
    cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
    print("Log Loss :",log_loss(cv_y, sig_clf_probs))
# plot CV log loss against alpha
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (alpha[i],cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
# Refit the linear SVM at the best alpha and report losses per split.
# Fix: undefined y_train / y_cv / y_test replaced with train_y / cv_y / test_y.
best_alpha = np.argmin(cv_log_error_array)
# clf = SVC(C=i,kernel='linear',probability=True, class_weight='balanced')
clf = SGDClassifier(class_weight='balanced', alpha=alpha[best_alpha], penalty='l2', loss='hinge', random_state=42)
clf.fit(train_x_onehotCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_onehotCoding, train_y)
predict_y = sig_clf.predict_proba(train_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The train log loss is:",log_loss(train_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The cross validation log loss is:",log_loss(cv_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x_onehotCoding)
print('For values of best alpha = ', alpha[best_alpha], "The test log loss is:",log_loss(test_y, predict_y, labels=clf.classes_, eps=1e-15))
model_select['Linear_svm'] = {'clf':clf,'best_trainloss':0.504,
                              'best_crossvalloss':1.12,
                              'best_testloss':1.1}
# Testing Model with best Hyperparmeters
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='hinge', random_state=42,class_weight='balanced')
predict_and_plot_confusion_matrix(train_x_onehotCoding, train_y,cv_x_onehotCoding,cv_y, clf)
# NOTE(review): this refit (used for the feature-importance cells below)
# drops class_weight='balanced' — confirm that is intentional.
clf = SGDClassifier(alpha=alpha[best_alpha], penalty='l2', loss='hinge', random_state=42)
clf.fit(train_x_onehotCoding,train_y)
# feature importance for a correctly classified test point (linear-SVM weights)
test_point_index = 1
# test_point_index = 100
no_feature = 500
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
print("Predicted Class :", predicted_cls[0])
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]),4))
print("Actual Class :", test_y[test_point_index])
# indices of the no_feature largest-weight features for the predicted class
indices = np.argsort(-clf.coef_)[predicted_cls-1][:,:no_feature]
print("-"*50)
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index],test_df['Gene'].iloc[test_point_index],test_df['Variation'].iloc[test_point_index], no_feature)
# same inspection for an incorrectly classified point
test_point_index = 100
no_feature = 500
predicted_cls = sig_clf.predict(test_x_onehotCoding[test_point_index])
print("Predicted Class :", predicted_cls[0])
print("Predicted Class Probabilities:", np.round(sig_clf.predict_proba(test_x_onehotCoding[test_point_index]),4))
print("Actual Class :", test_y[test_point_index])
indices = np.argsort(-clf.coef_)[predicted_cls-1][:,:no_feature]
print("-"*50)
get_impfeature_names(indices[0], test_df['TEXT'].iloc[test_point_index],test_df['Gene'].iloc[test_point_index],test_df['Variation'].iloc[test_point_index], no_feature)
# --- RandomForest hyperparameter search (n_estimators x max_depth) on one-hot features ---
# Fix: RandomForestClassifier is not imported in the visible file header;
# import it locally so this cell does not raise NameError.
from sklearn.ensemble import RandomForestClassifier
alpha = [100,200,500,1000,2000]
max_depth = [5, 10]
cv_log_error_array = []
for i in alpha:
    for j in max_depth:
        print("for n_estimators =", i,"and max depth = ", j)
        clf = RandomForestClassifier(n_estimators=i, criterion='gini', max_depth=j, random_state=42, n_jobs=-1)
        clf.fit(train_x_onehotCoding, train_y)
        sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
        sig_clf.fit(train_x_onehotCoding, train_y)
        sig_clf_probs = sig_clf.predict_proba(cv_x_onehotCoding)
        cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
        print("Log Loss :",log_loss(cv_y, sig_clf_probs))
# disabled plotting cell kept as a bare string literal (dead code from the notebook)
'''fig, ax = plt.subplots()
features = np.dot(np.array(alpha)[:,None],np.array(max_depth)[None]).ravel()
ax.plot(features, cv_log_error_array,c='g')
for i, txt in enumerate(np.round(cv_log_error_array,3)):
ax.annotate((alpha[int(i/2)],max_depth[int(i%2)],str(txt)), (features[i],cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.show()
'''
# Refit the random forest at the best (n_estimators, max_depth) pair.
# Fixes: RandomForestClassifier imported locally (missing from the visible
# header), and undefined y_train / y_cv / y_test replaced with
# train_y / cv_y / test_y.
from sklearn.ensemble import RandomForestClassifier
best_alpha = np.argmin(cv_log_error_array)
# the grid was alpha x max_depth with len(max_depth) == 2, hence /2 and %2
clf = RandomForestClassifier(n_estimators=alpha[int(best_alpha/2)], criterion='gini', max_depth=max_depth[int(best_alpha%2)], random_state=42, n_jobs=-1)
clf.fit(train_x_onehotCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_onehotCoding, train_y)
predict_y = sig_clf.predict_proba(train_x_onehotCoding)
print('For values of best estimator = ', alpha[int(best_alpha/2)], "The train log loss is:",log_loss(train_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_x_onehotCoding)
print('For values of best estimator = ', alpha[int(best_alpha/2)], "The cross validation log loss is:",log_loss(cv_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x_onehotCoding)
print('For values of best estimator = ', alpha[int(best_alpha/2)], "The test log loss is:",log_loss(test_y, predict_y, labels=clf.classes_, eps=1e-15))
model_select['RandomForest'] = {'clf':clf,'best_trainloss':0.85,
                                'best_crossvalloss':1.24,
                                'best_testloss':1.19}
clf = RandomForestClassifier(n_estimators=alpha[int(best_alpha/2)], criterion='gini', max_depth=max_depth[int(best_alpha%2)], random_state=42, n_jobs=-1)
predict_and_plot_confusion_matrix(train_x_onehotCoding, train_y,cv_x_onehotCoding,cv_y, clf)
# --------------------------------
# default parameters
# sklearn.ensemble.RandomForestClassifier(n_estimators=10, criterion=’gini’, max_depth=None, min_samples_split=2,
# min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0,
# min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False,
# class_weight=None)
# Some methods of RandomForestClassifier()
# fit(X, y[, sample_weight])  Build a forest of trees from the training set (X, y).
# predict(X)  Predict class for X.
# predict_proba(X)  Predict class probabilities for X.
# Some attributes of RandomForestClassifier()
# feature_importances_ : array of shape = [n_features]
# The feature importances (the higher, the more important the feature).
# --------------------------------
# video link: https://www.appliedaicourse.com/course/applied-ai-course-online/lessons/random-forest-and-their-construction-2/
# --------------------------------
# find more about CalibratedClassifierCV here at http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html
# ----------------------------
# default paramters
# sklearn.calibration.CalibratedClassifierCV(base_estimator=None, method=’sigmoid’, cv=3)
#
# some of the methods of CalibratedClassifierCV()
# fit(X, y[, sample_weight]) Fit the calibrated model
# get_params([deep]) Get parameters for this estimator.
# predict(X) Predict the target of new samples.
# predict_proba(X) Posterior probabilities of classification
#-------------------------------------
# video link:
#-------------------------------------
# --- RandomForest hyperparameter search on response-coded features ---
# Fix: RandomForestClassifier is not imported in the visible file header;
# import it locally so this cell runs standalone.
from sklearn.ensemble import RandomForestClassifier
alpha = [10,50,100,200,500,1000]
max_depth = [2,3,5,10]
cv_log_error_array = []
for i in alpha:
    for j in max_depth:
        print("for n_estimators =", i,"and max depth = ", j)
        clf = RandomForestClassifier(n_estimators=i, criterion='gini', max_depth=j, random_state=42, n_jobs=-1)
        clf.fit(train_x_responseCoding, train_y)
        sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
        sig_clf.fit(train_x_responseCoding, train_y)
        sig_clf_probs = sig_clf.predict_proba(cv_x_responseCoding)
        cv_log_error_array.append(log_loss(cv_y, sig_clf_probs, labels=clf.classes_, eps=1e-15))
        print("Log Loss :",log_loss(cv_y, sig_clf_probs))
# Refit the response-coded random forest at the best grid point.
# Fixes: local import for RandomForestClassifier, and undefined
# y_train / y_cv / y_test replaced with train_y / cv_y / test_y.
from sklearn.ensemble import RandomForestClassifier
best_alpha = np.argmin(cv_log_error_array)
# the grid was alpha x max_depth with len(max_depth) == 4, hence /4 and %4
clf = RandomForestClassifier(n_estimators=alpha[int(best_alpha/4)], criterion='gini', max_depth=max_depth[int(best_alpha%4)], random_state=42, n_jobs=-1)
clf.fit(train_x_responseCoding, train_y)
sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
sig_clf.fit(train_x_responseCoding, train_y)
predict_y = sig_clf.predict_proba(train_x_responseCoding)
print('For values of best alpha = ', alpha[int(best_alpha/4)], "The train log loss is:",log_loss(train_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(cv_x_responseCoding)
print('For values of best alpha = ', alpha[int(best_alpha/4)], "The cross validation log loss is:",log_loss(cv_y, predict_y, labels=clf.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(test_x_responseCoding)
print('For values of best alpha = ', alpha[int(best_alpha/4)], "The test log loss is:",log_loss(test_y, predict_y, labels=clf.classes_, eps=1e-15))
# --- Stacking: three calibrated base learners (LR via SGD, SVM via SGD, NB) ---
clf1 = SGDClassifier(alpha=0.001, penalty='l2', loss='log', class_weight='balanced', random_state=0)
clf1.fit(train_x_onehotCoding, train_y)
sig_clf1 = CalibratedClassifierCV(clf1, method="sigmoid")
clf2 = SGDClassifier(alpha=1, penalty='l2', loss='hinge', class_weight='balanced', random_state=0)
clf2.fit(train_x_onehotCoding, train_y)
sig_clf2 = CalibratedClassifierCV(clf2, method="sigmoid")
clf3 = MultinomialNB(alpha=0.001)
clf3.fit(train_x_onehotCoding, train_y)
sig_clf3 = CalibratedClassifierCV(clf3, method="sigmoid")
# fit each calibrated wrapper and report its CV log loss
sig_clf1.fit(train_x_onehotCoding, train_y)
print("Logistic Regression : Log Loss: %0.2f" % (log_loss(cv_y, sig_clf1.predict_proba(cv_x_onehotCoding))))
sig_clf2.fit(train_x_onehotCoding, train_y)
print("Support vector machines : Log Loss: %0.2f" % (log_loss(cv_y, sig_clf2.predict_proba(cv_x_onehotCoding))))
sig_clf3.fit(train_x_onehotCoding, train_y)
print("Naive Bayes : Log Loss: %0.2f" % (log_loss(cv_y, sig_clf3.predict_proba(cv_x_onehotCoding))))
print("-"*50)
# --- Tune the stacking meta-classifier's C on the CV split ---
# Fixes: LogisticRegression is not imported in the visible header; also the
# original stored the best LOG LOSS into a variable named `best_alpha` and
# never recorded which C value achieved it — track both explicitly.
# NOTE(review): StackingClassifier usage matches the mlxtend API
# (classifiers=/meta_classifier=/use_probas=) — confirm it is imported earlier.
from sklearn.linear_model import LogisticRegression
alpha = [0.0001,0.001,0.01,0.1,1,10]
best_loss = float('inf')
best_alpha = alpha[0]
for i in alpha:
    lr = LogisticRegression(C=i)
    sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3], meta_classifier=lr, use_probas=True)
    sclf.fit(train_x_onehotCoding, train_y)
    # compute the CV probabilities once and reuse them for the print
    log_error = log_loss(cv_y, sclf.predict_proba(cv_x_onehotCoding))
    print("Stacking Classifer : for the value of alpha: %f Log Loss: %0.3f" % (i, log_error))
    if log_error < best_loss:
        best_loss = log_error
        best_alpha = i
# final stacking model with C=0.1 (hard-coded choice from the sweep above)
lr = LogisticRegression(C=0.1)
sclf = StackingClassifier(classifiers=[sig_clf1, sig_clf2, sig_clf3], meta_classifier=lr, use_probas=True)
sclf.fit(train_x_onehotCoding, train_y)
log_error = log_loss(train_y, sclf.predict_proba(train_x_onehotCoding))
print("Log loss (train) on the stacking classifier :",log_error)
log_error = log_loss(cv_y, sclf.predict_proba(cv_x_onehotCoding))
print("Log loss (CV) on the stacking classifier :",log_error)
log_error = log_loss(test_y, sclf.predict_proba(test_x_onehotCoding))
print("Log loss (test) on the stacking classifier :",log_error)
print("Number of missclassified point :", np.count_nonzero((sclf.predict(test_x_onehotCoding)- test_y))/test_y.shape[0])
plot_confusion_matrix(test_y=test_y, predict_y=sclf.predict(test_x_onehotCoding))
# print every stored model with its recorded losses
for model_name in model_select:
    print('----------------------------------------------------\n')
    print(model_name)
    for metric, value in model_select[model_name].items():
        print('{} {}\n'.format(metric, value))
# distribution of document lengths (in tokens)
len_sent = pd.Series([len(sent) for sent in sentences])
len_sent.describe()
# train word2vec embeddings over the tokenized corpus
from gensim.models import Word2Vec
%time model = Word2Vec(sentences,min_count = 1,size = 300)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words[:5])
# access vector for one word
print(model['dependent'])
# save model
model.save('cancer_w2v.bin')
# # load model
# new_model = Word2Vec.load('model.bin')
# print(new_model)
model.most_similar('dependent')
result['TEXT'].iloc[null_text_ids]
data_text.head(5)
result.head(2)
# For the classical ML models we imputed blank TEXT with 'variation + class';
# undo that here, because it would inflate the spread between the longest and
# shortest documents. Keep only rows whose TEXT was originally present.
text_ids = set(range(len(result))) - set(null_text_ids)
text_ids = list(text_ids)
x = result[['TEXT']].iloc[text_ids, :]
y = result[['Class']].iloc[text_ids, :]
from preprocessing.utils import Preprocess, remove_empty_docs
from model.cnn_document_model import DocumentModel, TrainingParameters
from keras.callbacks import ModelCheckpoint, EarlyStopping
import numpy as np
from keras.utils import to_categorical
import keras.backend as K
<img src = 'DM.png'>
# Override the Preprocess class-level knobs before constructing the
# preprocessor: minimum term frequency, tokens per sentence, sentences per doc.
Preprocess.MIN_WD_COUNT=5
Preprocess.SENTENCE_LEN = 30
Preprocess.NUM_SENTENCES = 40
# Stratified 80/20 split on the class label.
x_trn,x_tst,y_trn,y_tst = train_test_split(x, y, stratify=y, test_size=0.2)
x_trn = x_trn.values
x_trn.shape
x_tst = x_tst.values
x_tst = x_tst.tolist()
x_trn = x_trn.tolist()
# Each row is a single-column list like [text]; unwrap to plain strings.
x_trn = [doc[0] for doc in x_trn]
x_tst = [doc[0] for doc in x_tst]
# Fit the vocabulary on the training corpus only, then encode the test corpus
# with that same vocabulary.
preprocessor = Preprocess(corpus=x_trn)
corpus_to_seq = preprocessor.fit()
test_corpus_to_seq = preprocessor.transform(x_tst)
len(corpus_to_seq[0]) # note how it's equal to num_sentences*sentence_length
vocab = preprocessor.get_vocab()
len(vocab) # total vocabulary
vocab['hi']
# Build the hierarchical CNN document model. num_units_final_layer=9 means a
# softmax over the 9 mutation classes; embeddings and both conv stages are
# trained from scratch (train_embedding / learn_*_conv all True).
cancer_model = DocumentModel(vocab_size=preprocessor.get_vocab_size(),
sent_k_maxpool = 5,
sent_filters = 20,
word_kernel_size = 5,
word_index = preprocessor.word_index,
num_sentences=Preprocess.NUM_SENTENCES,
conv_activation = 'relu',
train_embedding = True,
learn_word_conv = True,
learn_sent_conv = True,
sent_dropout = 0.4,
hidden_dims=64,
input_dropout=0.2,
hidden_gaussian_noise_sd=0.5,
final_layer_kernel_regularizer=0.1,
num_hidden_layers=3,
num_units_final_layer=9)
# save model parameters
train_params = TrainingParameters('cancer_model',
model_file_path = './cancer_model/cancer_model.hdf5',
model_hyper_parameters = './cancer_model/cancer_model.json',
model_train_parameters = './cancer_model/cancer_model_meta.json',
num_epochs=20,
batch_size = 128,
validation_split=.10,
learning_rate=0.01)
train_params.save()
cancer_model._save_model(train_params.model_hyper_parameters)
cancer_model._model.compile(loss="categorical_crossentropy",
optimizer=train_params.optimizer,
metrics=["accuracy"])
# Shift labels from 1..9 down to 0..8 so they can be one-hot encoded.
y_trn = [(label-1) for label in y_trn['Class'].tolist()]
y_tst = [(label-1) for label in y_tst['Class'].tolist()]
# Keep only the best weights (by validation loss); stop after 2 stagnant epochs.
checkpointer = ModelCheckpoint(filepath=train_params.model_file_path,
verbose=1,
save_best_only=True,
save_weights_only=True)
early_stop = EarlyStopping(patience=2)
# Materialize the padded index sequences and one-hot label matrices.
x_train_dep = np.array(corpus_to_seq)
y_train_dep = to_categorical(np.array(y_trn))
x_test_dep = np.array(test_corpus_to_seq)
y_test_dep = to_categorical(np.array(y_tst))
y_train_dep[0].shape
#Set LR
K.set_value(cancer_model.get_classification_model().optimizer.lr, train_params.learning_rate)
model = cancer_model.get_classification_model()
model.fit(x_train_dep, y_train_dep,
batch_size=train_params.batch_size,
epochs=train_params.num_epochs,
verbose=2,
validation_split=train_params.validation_split,
callbacks=[checkpointer,early_stop])
cancer_model.get_classification_model().evaluate( x_test_dep, y_test_dep, verbose=2)
preds = cancer_model.get_classification_model().predict(x_test_dep)
# argmax over the class probabilities -> predicted class ids (0..8)
preds_test = np.argmax(preds, axis=1)
test_labels = y_tst
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
print(classification_report(test_labels, preds_test))
print(confusion_matrix(test_labels, preds_test))
print(accuracy_score(test_labels, preds_test))
# Our model, without much parameter tuning, got stuck at a val_loss around 1.39,
# and even our eval metrics are not so good.
# PREPROCESSING
def strip_html_tags(text):
    """Return *text* with all HTML markup removed via BeautifulSoup's parser."""
    return BeautifulSoup(text, "html.parser").get_text()
def remove_empty_docs(corpus, labels):
    """Drop documents with no non-whitespace content, keeping labels aligned.

    Parameters:
        corpus: iterable of document strings.
        labels: iterable of labels, parallel to corpus.

    Returns:
        (filtered_corpus, filtered_labels) -- the same pairs, minus any
        document that is empty or whitespace-only.
    """
    filtered_corpus = []
    filtered_labels = []
    for doc, label in zip(corpus, labels):
        # FIX: the original condition was `doc.strip() or len(doc) > 30`,
        # which kept whitespace-only documents longer than 30 characters --
        # exactly what this function is meant to remove. strip() alone is
        # the correct emptiness test (any non-empty doc passes it).
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)
    return filtered_corpus, filtered_labels
class Preprocess:
    """Convert a corpus of documents into fixed-size word-index sequences.

    Each document is split into at most NUM_SENTENCES sentences of at most
    SENTENCE_LEN tokens; both levels are zero-padded, so every document maps
    to exactly NUM_SENTENCES * SENTENCE_LEN word indices.
    Index 0 is reserved for padding and index 1 for out-of-vocabulary words,
    so real words are indexed from 2 upward.
    """

    NUM_SENTENCES = 10
    SENTENCE_LEN = 30
    MIN_WD_COUNT = 5
    MAX_SEQUENCE_LENGTH = SENTENCE_LEN * NUM_SENTENCES

    def __init__(self, corpus):
        # Recompute here so class-level overrides of SENTENCE_LEN /
        # NUM_SENTENCES made before construction take effect.
        Preprocess.MAX_SEQUENCE_LENGTH = Preprocess.SENTENCE_LEN * Preprocess.NUM_SENTENCES
        self.corpus = corpus
        # FIX: initialize word_index so get_vocab()/get_vocab_size()/transform()
        # raise the intended ValueError (not AttributeError) before fit().
        self.word_index = None

    def _build_vocab(self):
        """Count lowercase alphabetic tokens and index those occurring at
        least MIN_WD_COUNT times. Returns the word -> index dict."""
        word_counts = {}
        for doc in self.corpus:
            for sentence in sent_tokenize(doc):
                tokens = wordpunct_tokenize(sentence)
                tokens = [token.lower().strip() for token in tokens]
                # keep purely alphabetic tokens only
                tokens = [token for token in tokens if re.match('^[a-z]+$', token) is not None]
                for token in tokens:
                    word_counts[token] = word_counts.get(token, 0) + 1
        filtered_word_index = {}
        # i = 0 is reserved for padding, 1 for OOV
        i = 2
        for word, count in word_counts.items():
            if count >= Preprocess.MIN_WD_COUNT:
                filtered_word_index[word] = i
                i += 1
        print('Found %s unique tokens.' % len(filtered_word_index))
        return filtered_word_index

    def _text2wordindex_seq(self, word_index, corpus):
        """Convert each document into a fixed-length sequence of word indices.

        Splits each doc into sentences and maps words to indices (words
        missing from the vocabulary are dropped). Pads/truncates each
        sentence to SENTENCE_LEN ids and each document to
        MAX_SEQUENCE_LENGTH ids.
        """
        data = []
        doc_count = 0
        for doc in corpus:
            doc2wordseq = []
            sent_num = 0
            doc_count += 1
            if doc_count % 1000 == 0:
                percent_processed = doc_count * 100 / len(corpus)
                sys.stdout.write("\r%f%% documents processed." % percent_processed)
                sys.stdout.flush()
            for sentence in sent_tokenize(doc):
                words = wordpunct_tokenize(sentence)
                words = [token.lower().strip() for token in words]
                word_id_seq = [word_index[word] for word in words if word_index.get(word) is not None]
                padded_word_id_seq = pad_sequences([word_id_seq], maxlen=Preprocess.SENTENCE_LEN,
                                                   padding='post',
                                                   truncating='post')
                if sent_num < Preprocess.NUM_SENTENCES:
                    doc2wordseq = doc2wordseq + list(padded_word_id_seq[0])
                else:
                    break
                sent_num += 1
            # if the doc has fewer than NUM_SENTENCES sentences, post-pad with zeros
            doc2wordseq = pad_sequences([doc2wordseq], maxlen=Preprocess.MAX_SEQUENCE_LENGTH,
                                        padding='post',
                                        truncating='post')
            data.append(doc2wordseq[0])
        sys.stdout.write("\rAll documents processed.")
        return data

    def fit(self):
        """Build the vocabulary from self.corpus, encode it, and return the
        encoded sequences."""
        self.word_index = self._build_vocab()
        self.processed = self._text2wordindex_seq(self.word_index, self.corpus)
        return self.processed

    def transform(self, corpus):
        """Encode a new corpus with the vocabulary learned by fit()."""
        if self.word_index is None:
            raise ValueError('fit must be called first to build vocab')
        return self._text2wordindex_seq(self.word_index, corpus)

    def get_vocab_size(self):
        """Vocabulary size including the two reserved indices (pad, OOV)."""
        if self.word_index is None:
            raise ValueError('fit must be called first to build vocab')
        return len(self.word_index) + 2

    def get_vocab(self):
        """Return the fitted word -> index mapping."""
        if self.word_index is None:
            raise ValueError('fit must be called first to build vocab')
        return self.word_index
# THE MODEL:
import tensorflow as tf
from keras.layers import Layer, InputSpec
class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k highest activations along the
    steps (2nd) dimension of a (batch, steps, channels) input tensor.
    TensorFlow backend only.

    Output shape is (batch, channels, k): the transpose in `call` moves the
    channel axis ahead of the pooled axis.
    """

    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)  # expects (batch, steps, channels)
        self.k = k

    def get_config(self):
        # FIX: added so the layer is serializable -- without this, `k` was
        # lost when the model config was saved/reloaded.
        config = super().get_config()
        config['k'] = self.k
        return config

    def compute_output_shape(self, input_shape):
        # FIX: `call` returns a 3-D tensor (batch, channels, k); the original
        # reported a flattened 2-D shape (batch, channels*k) that did not
        # match the tensor actually produced.
        return (input_shape[0], input_shape[2], self.k)

    def call(self, inputs):
        # swap last two dimensions so top_k pools along the original steps axis
        shifted_input = tf.transpose(inputs, [0, 2, 1])
        # top_k returns (values, indices); keep the values, sorted descending
        top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0]
        return top_k
from keras.layers import Input, Dense, Embedding, Conv1D, Dropout, Concatenate, Lambda, GaussianNoise
from keras.layers.core import Reshape, Flatten, Permute
from keras.models import Model
from keras import regularizers
from model.custom_layer import KMaxPooling
import json
class TrainingParameters:
    """Bundle of training-run configuration for a document model.

    Holds the model name, the three output file paths (weights,
    hyper-parameters JSON, training-metadata JSON) and the optimizer /
    schedule settings. `save()` serializes the whole bundle as JSON.
    """

    def __init__(self, model_name,
                 model_file_path,
                 model_hyper_parameters,
                 model_train_parameters,
                 seed=55,
                 test_data_proportion=0.3,
                 batch_size=64,
                 num_epochs=20,
                 validation_split=0.05,
                 optimizer='rmsprop',
                 learning_rate=0.001):
        # Assign in a fixed order so the key order of the serialized JSON
        # (which follows __dict__ insertion order) stays stable.
        attr_names = ("model_name", "model_file_path", "model_hyper_parameters",
                      "model_train_parameters", "seed", "test_data_proportion",
                      "batch_size", "num_epochs", "validation_split",
                      "optimizer", "learning_rate")
        attr_values = (model_name, model_file_path, model_hyper_parameters,
                       model_train_parameters, seed, test_data_proportion,
                       batch_size, num_epochs, validation_split,
                       optimizer, learning_rate)
        for attr, value in zip(attr_names, attr_values):
            setattr(self, attr, value)

    def save(self):
        """Write every parameter as JSON to `model_train_parameters`."""
        with open(self.model_train_parameters, "w", encoding="utf-8") as out:
            json.dump(self.__dict__, out)
class DocumentModel:
    """Hierarchical CNN document classifier.

    Words are embedded, then a shared word-level Conv1D + k-max pooling
    produces a per-sentence feature vector ("sentence embedding"); a
    sentence-level Conv1D + k-max pooling over those produces the document
    embedding, followed by dense hidden layers and a final sigmoid/softmax
    classification layer.
    """

    def __init__(self, vocab_size,
                 word_index,
                 embedding_dim=50,
                 embedding_weights = None,
                 embedding_regularizer_l2 = 0.0,
                 train_embedding=True,
                 sentence_len=30,
                 num_sentences=10,
                 word_kernel_size = 5,
                 word_filters=30,
                 sent_kernel_size=5,
                 sent_filters = 16,
                 sent_k_maxpool =3 ,
                 input_dropout = 0,
                 doc_k_maxpool = 4,
                 sent_dropout = 0,
                 hidden_dims = 64,
                 conv_activation = 'relu',
                 hidden_activation = 'relu',
                 hidden_dropout = 0,
                 num_hidden_layers = 1,
                 hidden_gaussian_noise_sd = 0.5,
                 hidden_layer_kernel_regularizer = 0.0,
                 final_layer_kernel_regularizer = 0.0,
                 num_units_final_layer = 1,
                 learn_word_conv=True,
                 learn_sent_conv=True):
        """Store all hyper-parameters, then build the Keras graph.

        vocab_size is expected to be len(word_index) + 2 (padding + OOV
        indices), which is why the informational mismatch message below
        usually fires.
        """
        self.vocab_size = vocab_size
        self.word_index = word_index
        self.embedding_dim = embedding_dim
        self.embedding_weights = embedding_weights
        self.train_embedding = train_embedding
        self.embedding_regularizer_l2 = embedding_regularizer_l2
        self.sentence_len = sentence_len
        self.num_sentences = num_sentences
        self.word_kernel_size = word_kernel_size
        self.word_filters = word_filters
        self.sent_kernel_size = sent_kernel_size
        self.sent_filters = sent_filters
        self.sent_k_maxpool = sent_k_maxpool
        self.input_dropout = input_dropout
        self.doc_k_maxpool = doc_k_maxpool
        self.sent_dropout = sent_dropout
        self.hidden_dims = hidden_dims
        self.conv_activation = conv_activation
        self.hidden_activation = hidden_activation
        self.hidden_dropout = hidden_dropout
        self.num_hidden_layers = num_hidden_layers
        self.hidden_gaussian_noise_sd = hidden_gaussian_noise_sd
        self.final_layer_kernel_regularizer = final_layer_kernel_regularizer
        self.hidden_layer_kernel_regularizer = hidden_layer_kernel_regularizer
        self.learn_word_conv = learn_word_conv
        self.learn_sent_conv = learn_sent_conv
        self.num_units_final_layer = num_units_final_layer
        if vocab_size != len(word_index):
            print("Vocab Size = {} and the index of vocabulary words passed has {} words".format(vocab_size,len(word_index)))
        self._build_model()
        self.weights_file = None

    def _build_model(self):
        """Assemble the word-level and sentence-level convolution stack."""
        max_seq_length = self.sentence_len*self.num_sentences
        # Embedding layer -- built once; pretrained weights are attached only
        # when supplied (the original constructed the layer twice).
        # NOTE: the layer name 'imdb_embedding' is kept as-is because
        # load_model_weights() matches layers by name.
        embedding_kwargs = dict(input_length=max_seq_length,
                                trainable=self.train_embedding,
                                embeddings_regularizer=regularizers.l2(self.embedding_regularizer_l2),
                                name='imdb_embedding')
        if self.embedding_weights is not None:
            embedding_kwargs['weights'] = [self.embedding_weights]
        embedding_layer = Embedding(self.vocab_size,
                                    self.embedding_dim,
                                    **embedding_kwargs)
        # input layer: sequence of word indices for the whole document
        sequence_input = Input(shape=(max_seq_length,), dtype='int32')
        z = embedding_layer(sequence_input)
        if self.input_dropout > 0:
            z = Dropout(self.input_dropout)(z)
        conv_blocks = []
        # the same convolution filters are shared by all sentences
        word_conv_model = Conv1D(filters=self.word_filters,
                                 kernel_size=self.word_kernel_size,
                                 padding="valid",
                                 activation=self.conv_activation,
                                 trainable = self.learn_word_conv,
                                 name = "word_conv",
                                 strides=1)
        for sent in range(self.num_sentences):
            # slice out one sentence's span of the embedded sequence
            sentence = Lambda(lambda x : x[:,sent*self.sentence_len: (sent+1)*self.sentence_len, :])(z)
            conv = word_conv_model(sentence)
            conv = KMaxPooling(k=self.sent_k_maxpool)(conv)
            # transpose pooled values per sentence into a column vector
            conv = Reshape([self.word_filters*self.sent_k_maxpool,1])(conv)
            conv_blocks.append(conv)
        # concatenate all sentence feature maps into the sentence-embedding tensor
        z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
        # transform to (steps, input_dim) for the sentence-level convolution
        z = Permute([2,1], name='sentence_embeddings')(z)
        if self.sent_dropout > 0:
            z = Dropout(self.sent_dropout)(z)
        sent_conv = Conv1D(filters=self.sent_filters,
                           kernel_size=self.sent_kernel_size,
                           padding="valid",
                           activation=self.conv_activation,
                           trainable = self.learn_sent_conv,
                           name = 'sentence_conv',
                           strides=1)(z)
        z = KMaxPooling(k=self.doc_k_maxpool)(sent_conv)
        z = Flatten(name='document_embedding')(z)
        # Regularize the document embedding: Gaussian noise takes priority
        # over plain dropout when both are configured.
        if self.hidden_gaussian_noise_sd:
            z = GaussianNoise(self.hidden_gaussian_noise_sd)(z)
        elif self.hidden_dropout:
            z = Dropout(self.hidden_dropout)(z)
        for i in range(self.num_hidden_layers):
            layer_name = 'hidden_{}'.format(i)
            z = Dense(self.hidden_dims, activation=self.hidden_activation, name=layer_name,
                      kernel_regularizer=regularizers.l2(self.hidden_layer_kernel_regularizer))(z)
        # single unit -> binary sigmoid; multiple units -> softmax
        output_activation = 'sigmoid'
        if self.num_units_final_layer > 1:
            output_activation = 'softmax'
        model_output = Dense(self.num_units_final_layer, activation=output_activation,
                             kernel_regularizer=regularizers.l2(self.final_layer_kernel_regularizer),
                             name='final')(z)
        self._model = Model(sequence_input, model_output)

    def get_document_model(self):
        """Sub-model that maps input sequences to the document embedding."""
        return Model(inputs=self._model.input,
                     outputs=self._model.get_layer('document_embedding').output)

    def get_sentence_model(self):
        """Sub-model that maps input sequences to the sentence embeddings."""
        return Model(inputs=self._model.input,
                     outputs=self._model.get_layer('sentence_embeddings').output)

    def get_classification_model(self):
        """The full classifier (input -> class probabilities)."""
        return self._model

    def _save_model(self, file_name):
        """Dump all hyper-parameters (not the Keras model or the embedding
        weight matrix) as JSON so the model can be rebuilt by load_model()."""
        model_params = {}
        for key in self.__dict__.keys():
            if key not in ['_model', 'embedding_weights']:
                model_params[key] = self.__dict__[key]
        with open(file_name, "w", encoding="utf-8") as hp_file:
            json.dump(model_params, hp_file)

    @staticmethod
    def load_model(file_name):
        """Rebuild a DocumentModel from hyper-parameters saved by _save_model.

        FIX: declared @staticmethod -- the original plain def only worked when
        called on the class; calling it on an instance would have passed the
        instance itself as file_name.
        """
        with open(file_name, "r", encoding="utf-8") as hp_file:
            model_params = json.load(hp_file)
        doc_model = DocumentModel(**model_params)
        print(model_params)
        return doc_model

    def load_model_weights(self, model_weights_filename):
        """Load saved weights into the built graph, matching layers by name."""
        self._model.load_weights(model_weights_filename, by_name=True)
References I have used:
# Train a Doc2Vec model over the cleaned document corpus.
from gensim.test.utils import common_texts
import multiprocessing
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
cores = multiprocessing.cpu_count()
gensim.models.doc2vec.FAST_VERSION > -1  # otherwise it will be painfully slow....
len(x), len(y)
# FIX: `documents` must exist before any Doc2Vec model is trained on it; the
# original first ran `Doc2Vec(documents, vector_size=5, ...)` two lines before
# `documents` was defined (NameError as a script), and that throwaway model was
# immediately overwritten anyway, so it is dropped here.
documents = x['TEXT'].tolist()
tag_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
model = Doc2Vec(tag_documents, vector_size=200, window=5, min_count=1, workers=cores)
# free training-only state; keep what is needed for inference and doctag lookups
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
doc1_vec = model.infer_vector(documents[0])
sims = model.docvecs.most_similar([doc1_vec])
sims
# now let's check doc 1867 and 0
documents[0][0:500]
documents[1867][0:500]
# Infer a fixed-length vector for every document with the trained Doc2Vec model.
text_vec = np.array([model.infer_vector(doc) for doc in documents])
labels = y['Class'].ravel()
txt_trn, txt_tst, lbl_trn, lbl_tst = train_test_split(
    text_vec, labels, stratify=labels, test_size=0.2, random_state=1)
#let's see if we could use a logistic regression for classification
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
# First: logistic regression on the inferred Doc2Vec vectors.
clf = LogisticRegressionCV(n_jobs = -1)
clf.fit(txt_trn,lbl_trn)
clf.score(txt_trn,lbl_trn) ## similar to our deep model....
y_pred = clf.predict(txt_tst)
accuracy_score(lbl_tst,y_pred) # that's a pretty poor performance but how poor...is it any good compared to
#simple tfidf...
# Second: TF-IDF features + logistic regression directly on the raw documents.
trn_docs,tst_docs,lbl_trn,lbl_tst = train_test_split(documents,labels,stratify = labels,test_size = 0.2,random_state = 1)
vec = TfidfVectorizer()
clf = LogisticRegressionCV(n_jobs = -1) # by default C between 1e-4 and 1e4 is searched and the best one is fit...
pipe = make_pipeline(vec, clf)
pipe.fit(trn_docs, lbl_trn)
y_pred = pipe.predict(tst_docs)
accuracy_score(lbl_tst,y_pred) # simple TF-IDF based features are working best....
import eli5
# Inspect the fitted model: most influential terms per class.
eli5.show_weights(clf, vec=vec, top=10)
eli5.show_prediction(clf, tst_docs[0], vec=vec,
) # Dark red shows high contribution for the prediction